- Do all rows / columns have the same percentage of missing values?
- Are there correlations between missing rows / columns? (If a value is missing in one column it is likely to be missing in another column.)
library(tidyverse)
# Add NAs to mtcars dataset
set.seed(5702)
mycars <- mtcars
mycars[,"gear"] <- NA
mycars[10:20, 3:5] <- NA
for (i in 1:10) mycars[sample(32,1), sample(11,1)] <- NA
colSums(is.na(mycars)) %>%
sort(decreasing = TRUE)
## gear disp hp drat am cyl qsec vs mpg wt carb
## 32 12 12 11 3 1 1 1 0 0 0
rowSums(is.na(mycars)) %>%
sort(decreasing = TRUE)
## Merc 450SE Merc 450SLC Lincoln Continental Fiat 128
## 5 5 5 5
## Merc 280 Merc 280C Merc 450SL Cadillac Fleetwood
## 4 4 4 4
## Chrysler Imperial Honda Civic Toyota Corolla Lotus Europa
## 4 4 4 3
## AMC Javelin Fiat X1-9 Mazda RX4 Mazda RX4 Wag
## 2 2 1 1
## Datsun 710 Hornet 4 Drive Hornet Sportabout Valiant
## 1 1 1 1
## Duster 360 Merc 240D Merc 230 Toyota Corona
## 1 1 1 1
## Dodge Challenger Camaro Z28 Pontiac Firebird Porsche 914-2
## 1 1 1 1
## Ford Pantera L Ferrari Dino Maserati Bora Volvo 142E
## 1 1 1 1
geom_tile()
library(tidyverse)
tidycars <- mycars %>%
rownames_to_column("id") %>%
gather(key, value, -id) %>%
mutate(missing = ifelse(is.na(value), "yes", "no"))
ggplot(tidycars, aes(x = key, y = fct_rev(id), fill = missing)) +
geom_tile(color = "white") +
ggtitle("mtcars with NAs added") +
scale_fill_viridis_d() + # discrete scale
theme_bw()
mi::missing_data.frame()
library(mi)
x <- missing_data.frame(mycars)
image(x)
(gear not shown since all are missing)
geom_tile()
ggplot(tidycars, aes(x = key, y = fct_rev(id), fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient(low = "grey80", high = "red", na.value = "black") + theme_bw()
geom_tile() with standardized variables
tidycars <- tidycars %>% group_by(key) %>%
mutate(Std = (value-mean(value, na.rm = TRUE))/sd(value, na.rm = TRUE)) %>% ungroup()
ggplot(tidycars, aes(x = key, y = fct_rev(id), fill = Std)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", mid = "white", high ="yellow", na.value = "black") + theme_bw()
reordered by number of missing
# convert missing to numeric so it can be summed up
tidycars <- tidycars %>%
mutate(missing2 = ifelse(missing == "yes", 1, 0))
ggplot(tidycars, aes(x = fct_reorder(key, -missing2, sum), y = fct_reorder(id, -missing2, sum), fill = Std)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", mid = "white", high ="yellow", na.value = "black") + theme_bw()
missing patterns
x <- mi::missing_data.frame(mycars)
## NOTE: In the following pairs of variables, the missingness pattern of the second is a subset of the first.
## Please verify whether they are in fact logically distinct variables.
## [,1] [,2]
## [1,] "disp" "drat"
## [2,] "disp" "qsec"
## [3,] "hp" "drat"
## [4,] "hp" "qsec"
## [5,] "hp" "am"
## [6,] "drat" "qsec"
class(x)
## [1] "missing_data.frame"
## attr(,"package")
## [1] "mi"
x@patterns
## [1] nothing nothing nothing
## [4] nothing nothing nothing
## [7] nothing nothing nothing
## [10] disp, hp, drat disp, hp, drat disp, hp, drat, am
## [13] disp, hp, drat disp, hp, drat, qsec disp, hp, drat
## [16] disp, hp, drat, am disp, hp, drat cyl, disp, hp, drat
## [19] disp, hp, drat disp, hp, drat nothing
## [22] nothing vs nothing
## [25] nothing disp nothing
## [28] hp, am nothing nothing
## [31] nothing nothing
## 8 Levels: nothing vs disp hp, am disp, hp, drat ... cyl, disp, hp, drat
levels(x@patterns)
## [1] "nothing" "vs" "disp"
## [4] "hp, am" "disp, hp, drat" "disp, hp, drat, am"
## [7] "disp, hp, drat, qsec" "cyl, disp, hp, drat"
summary(x@patterns)
## nothing vs disp
## 18 1 1
## hp, am disp, hp, drat disp, hp, drat, am
## 1 7 2
## disp, hp, drat, qsec cyl, disp, hp, drat
## 1 1
(repeated patterns are reduced to one row)
# note: extracat is no longer available
library(extracat)
visna(mycars)
Sorted by most common to least common missing pattern (top to bottom)
visna(mycars, sort = "r")
Sorted by variable with the most to least missing values (left to right)
visna(mycars, sort = "c")
visna(mycars, sort = "b")
df <- read.csv("SAT2010.csv", na.strings = "s",
check.names = FALSE)
head(df)
## DBN School Name Number of Test Takers
## 1 01M292 Henry Street School for International Studies 31
## 2 01M448 University Neighborhood High School 60
## 3 01M450 East Side Community High School 69
## 4 01M458 SATELLITE ACADEMY FORSYTH ST 26
## 5 01M509 CMSP HIGH SCHOOL NA
## 6 01M515 Lower East Side Preparatory High School 154
## Critical Reading Mean Mathematics Mean Writing Mean
## 1 391 425 385
## 2 394 419 387
## 3 418 431 402
## 4 385 370 378
## 5 NA NA NA
## 6 314 532 314
dim(df)
## [1] 460 6
visna(df)
- Are missing patterns correlated with values of another variable?
- Are certain value ranges more likely to be missing?
- Or are there no patterns at all?
- (An explanation may not be available within the dataset.)
Does the proportion of schools with missing data vary by borough?
Data: SAT2010.csv
head(df)
## DBN School Name Number of Test Takers
## 1 01M292 Henry Street School for International Studies 31
## 2 01M448 University Neighborhood High School 60
## 3 01M450 East Side Community High School 69
## 4 01M458 SATELLITE ACADEMY FORSYTH ST 26
## 5 01M509 CMSP HIGH SCHOOL NA
## 6 01M515 Lower East Side Preparatory High School 154
## Critical Reading Mean Mathematics Mean Writing Mean
## 1 391 425 385
## 2 394 419 387
## 3 418 431 402
## 4 385 370 378
## 5 NA NA NA
## 6 314 532 314
df <- df %>% mutate(Borough = str_sub(DBN, 3, 3))
percent_missing <- df %>% group_by(Borough) %>%
summarize(num_schools = n(), num_na = sum(is.na(`Writing Mean`))) %>%
mutate(percent_na = round(num_na/num_schools, 2)) %>%
arrange(-percent_na)
percent_missing
## # A tibble: 5 × 4
## Borough num_schools num_na percent_na
## <chr> <int> <int> <dbl>
## 1 K 141 32 0.23
## 2 Q 73 14 0.19
## 3 M 111 13 0.12
## 4 X 124 14 0.11
## 5 R 11 1 0.09
K, Q, M, X, R
Manhattan
Brooklyn
Queens
The Bronx
Staten Island
K, Q, M, X, R
Manhattan = New York County
Brooklyn = Kings County
Queens = Queens County
The Bronx = Bronx County
Staten Island = Richmond County
K, Q, M, X, R
Manhattan = New York County = “M”
Brooklyn = Kings County = “K”
Queens = Queens County = “Q”
The Bronx = Bronx County = “X”
Staten Island = Richmond County = “R”
percent_missing <- percent_missing %>%
mutate(BoroughName = fct_recode(Borough,
Manhattan = "M",
Brooklyn = "K",
Queens = "Q",
`The Bronx` = "X",
`Staten Island` = "R"))
percent_missing
## # A tibble: 5 × 5
## Borough num_schools num_na percent_na BoroughName
## <chr> <int> <int> <dbl> <fct>
## 1 K 141 32 0.23 Brooklyn
## 2 Q 73 14 0.19 Queens
## 3 M 111 13 0.12 Manhattan
## 4 X 124 14 0.11 The Bronx
## 5 R 11 1 0.09 Staten Island
df <- df %>%
mutate(BoroughName = fct_recode(Borough,
Manhattan = "M",
Brooklyn = "K",
Queens = "Q",
`The Bronx` = "X",
`Staten Island` = "R"))
dfsum <- df %>% group_by(BoroughName) %>%
summarize(Reading = round(mean(`Critical Reading Mean`, na.rm = TRUE), 1),
Math = round(mean(`Mathematics Mean`, na.rm = TRUE), 1),
Writing = round(mean(`Writing Mean`, na.rm = TRUE), 1)) %>%
left_join(percent_missing %>% select(BoroughName, percent_na),
by = "BoroughName") %>%
arrange(desc(percent_na))
dfsumtidy <- dfsum %>% pivot_longer(cols = Reading:Writing,
names_to = "subject",
values_to = "meanscore")
ggplot(dfsumtidy, aes(meanscore, percent_na, color = BoroughName)) + geom_point(size = 2) + facet_wrap(~subject) + theme_bw() +
theme(legend.position = "bottom")
I need to know how much snow fell per day in New York State in February 2017, on a county or more detailed level. I know some government agency measures and reports snowfall and puts the data online.
I need to know how much snow fell per day in New York State in February 2017, on a county or more detailed level. I know some government agency measures and reports snowfall and puts the data online.
Source: https://www.ncdc.noaa.gov/snow-and-ice/daily-snow/NY/snowfall/20170201
Accessed: 2017-10-26
library(tidyverse)
# https://www.ncdc.noaa.gov/snow-and-ice/daily-snow/
# Accessed 2017-10-26
# Screenshot 2017-10-26 18.45.11.png
snow <- read_csv("NY-snowfall-201702.csv",
skip = 1,
na = "M")
# replace "T" (trace) with .005
trace_replace <- function(x) as.numeric(ifelse(x == "T", .005, x))
snow2 <- snow %>%
mutate_at(vars(`Feb 1`:`Feb 28`), trace_replace)
# note: extracat is no longer available
extracat::visna(snow, sort = "r")
# note: extracat is no longer available
extracat::visna(snow, sort = "c")
# note: extracat is no longer available
extracat::visna(snow, sort = "b")
# tidy snow data frame, add missing (yes or no) column
tidysnow <- snow %>%
select(`Station Name`, County, `Feb 1`:`Feb 28`) %>%
gather(key, value, -`Station Name`, -County) %>%
mutate(missing = ifelse(is.na(value), "yes", "no"))
# add a zero to single digit dates so they'll sort
# correctly (Feb 1 becomes Feb 01)
#
tidysnow$key <- ifelse(nchar(tidysnow$key) == 5,
gsub(" ", " 0", tidysnow$key),
tidysnow$key)
ggplot(tidysnow, aes(x = key, y = `Station Name`, fill = missing)) +
geom_raster() + scale_fill_manual(values = c("white", "red")) + scale_x_discrete("February 2017", labels = 1:28) + theme_bw()
theme_dotplot <- theme_bw(16) +
theme(axis.ticks.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(size = 0.5),
panel.grid.minor.x = element_blank())
na_by_station <- tidysnow %>%
group_by(County, `Station Name`) %>%
summarize(count = sum(is.na(value))) %>%
mutate(type = "station")
mean_na_by_county <- na_by_station %>%
group_by(County) %>%
summarize(count = mean(count)) %>%
add_column(`Station Name` = "mean", .before = "count") %>%
mutate(type = "mean")
combined_na <- bind_rows(na_by_station, mean_na_by_county)
ggplot(combined_na, aes(x = count,
y = reorder(County, count, mean),
color = type)) + geom_point() +
xlab("Number of missing values (out of 28)") +
theme_dotplot +
theme(axis.text.y = element_text(size = rel(.5)))
# change dates to YYYY-MM-DD format
tidysnow$key <- gsub("Feb ", "2017-02-", tidysnow$key)
# calculate number missing by day
missing <- tidysnow %>%
group_by(key) %>%
summarise(sum.na = sum(is.na(value)))
# plot number missing by day
ggplot(missing, aes(x = 1:28, y = sum.na)) +
geom_col(color = "blue", fill = "lightblue") +
scale_x_continuous(breaks = 1:28, labels = missing$key) +
ggtitle("Number of missing values by day") +
xlab("") +
ylab("Number of missing station values (out of 349)") +
theme(axis.text.x = element_text(angle = 90))
# add day of week info
missing <- missing %>%
mutate(dayofweek = weekdays(as.Date(key),
abbreviate = FALSE))
# correct day order
daysinorder <- c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday",
"Sunday")
# reorder dayofweek
missing$dayofweek <- factor(missing$dayofweek,
levels = daysinorder)
# choose colors
daycolors <- c(rep("#cbc9e2", 5), rep("#2b8cbe", 2))
# plot missing values by day, weekday/weekend colors
ggplot(missing, aes(x = key, y = sum.na, fill = dayofweek)) +
geom_col() +
ggtitle("Number of missing values by day") +
scale_fill_manual(values = daycolors) +
xlab("") +
ylab("Number of missing station values (out of 349)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90))
# calculate mean snowfall by day
meansnow <- tidysnow %>%
filter(!is.na(value) & (value != "T")) %>%
mutate(value = as.numeric(value)) %>%
group_by(key) %>%
summarise(avg = mean(value))
# calculate average snowfall per station by day
infodf <- right_join(missing, meansnow, by = "key")
# scatterplot of number missing by average snowfall
# What is the pattern?
g <- ggplot(infodf, aes(x = avg, y = sum.na)) +
geom_point() +
xlab("Mean statewide snowfall (in inches)") +
ylab("Number of missing station values (out of 349)")
g
g + theme_classic()